# Rural livelihoods and the footprint of the coal industry in Jharkhand, India. 
# [ANON]

# SAMPLING SCRIPT --------------------------------------------------------------

# Load Necessary Packages ------------------------------------------------------

library(pacman)

# Tidyverse
p_load(dplyr, readr, stringr, tidyr, ggplot2, readxl, patchwork, ggforce)

# Spatial
p_load(sf, mapview)

# Miscellaneous 
p_load(here)

# Load Data --------------------------------------------------------------------

# Set CRS

crs <- "EPSG:7765"

# State boundary
state_sf <- st_read(here("Data","Spatial", "gadm36_IND_1.shp")) %>% 
  select(state = NAME_1) %>% 
  filter(state == "Jharkhand") %>% 
  st_sf() %>% 
  st_transform(crs = crs)

# Revenue village centroids

# vill_sf <- st_read(here("Data","Spatial", "fVillPts2011c.shp")) %>% 
#   st_transform(crs = crs) %>% 
#   st_join(state_sf, left = FALSE)
# 
# st_write(vill_sf, here("Data","Spatial", "villages_jk.shp"))

vill_sf <- st_read(here("Data","Spatial", "villages_jk.shp"))

# Mine locations
mines_sf <- read_xlsx(here("Data", "Spatial", 
                           "Indian Coal Mines Dataset_January 2021-1.xlsx"), 
                      sheet = 2) %>% 
  select(-Source) %>% 
  st_as_sf(coords = c("Longitude", "Latitude"), crs = "EPSG:4326") %>% 
  st_transform(crs = crs) %>% 
  st_join(state_sf, left = FALSE)

# SHRUG datasets
vill_pc <- read.csv(here("Data", "Statistical",
                         "shrug-v1.5.samosa-pop-econ-census-csv",
                         "shrug-v1.5.samosa-pop-econ-census-csv",
                         "shrug_pc11.csv"))

vill_ec <- read.csv(here("Data", "Statistical",
                         "shrug-v1.5.samosa-pop-econ-census-csv",
                         "shrug-v1.5.samosa-pop-econ-census-csv",
                         "shrug_ec.csv"))

vill_keys <- read.csv(here("Data", "Statistical",
                           "shrug-v1.5.samosa-pop-econ-census-csv",
                           "shrug-v1.5.samosa-keys-csv",
                           "shrug_pc11_subdistrict_key.csv"))

vill_pc <- vill_pc %>% 
  transmute(shrid = shrid,
            CENSUS2011 = as.numeric(str_extract(shrid, pattern = "\\d{6}")),
            hhs_11 = pc11_pca_no_hh,
            pop_11 = pc11_pca_tot_p,
            pop_rural_11 = pc11_pca_tot_p_r,
            pop_urban_11 = pc11_pca_tot_p_u,
            pop_sc_11 = pc11_pca_p_sc,
            pop_st_11 = pc11_pca_p_st,
            villarea_11 = pc11_vd_area)

vill_ec <- vill_ec %>% 
  transmute(shrid = shrid,
            CENSUS2011 = as.numeric(str_extract(shrid, pattern = "\\d{6}")),
            emp_all_13 = ec13_emp_all,
            emp_manuf_13 = ec13_emp_manuf,
            emp_serv_13 = ec13_emp_services)

vill_keys <- vill_keys %>% 
  transmute(shrid = shrid,
            CENSUS2011 = as.numeric(str_extract(shrid, pattern = "\\d{6}")),
            district = pc11_district_name,
            subdistrict = pc11_subdistrict_name
  )

vill_pcec <- left_join(vill_pc, vill_ec) %>% 
  left_join(vill_keys) %>% 
  filter(CENSUS2011 %in% vill_sf$CENSUS2011)

vill_sf <- left_join(vill_sf, vill_pcec)

# Filter out urban revenue villages (CENSUS2011 ID starts with 8)
vill_sf <- vill_sf %>% 
  filter(!str_starts(CENSUS2011, "8"))

options(scipen = 999)

# Calculate coal production by distance ----------------------------------------

vill_mine_5k <- st_is_within_distance(vill_sf, mines_sf, dist = 5000)
vill_mine_10k <- st_is_within_distance(vill_sf, mines_sf, dist = 10000)
vill_mine_20k <- st_is_within_distance(vill_sf, mines_sf, dist = 20000)

vill_coalprod_df <- tibble(
  CENSUS2011 =
    vill_sf$CENSUS2011,
  coalprod5k = sapply(vill_mine_5k,
                      function(x)
                        sum(mines_sf$`Coal/ Lignite Production (MT) (2019-2020)`[x])),
  coalprod10k = sapply(vill_mine_10k,
                       function(x)
                         sum(mines_sf$`Coal/ Lignite Production (MT) (2019-2020)`[x])),
  coalprod20k = sapply(vill_mine_20k,
                       function(x)
                         sum(mines_sf$`Coal/ Lignite Production (MT) (2019-2020)`[x]))
)

vill_sf <- vill_sf %>% 
  left_join(vill_coalprod_df)

# Sample frame -----------------------------------------------------------------

# Prepare sample frame, removing revenue villages with rural population below 100
sampleframe <- vill_sf %>% 
  tibble() %>% 
  mutate(samplestrata = 
           case_when(
             coalprod5k > 5 ~ "5km_5MT",
             coalprod5k > 0 ~ "5km_any",
             coalprod10k > 5 ~ "10km_5MT",
             coalprod10k > 0 ~ "10km_any",
             coalprod20k > 5 ~ "20km_5MT",
             coalprod20k > 0 ~ "20km_any",
             TRUE ~ "NOT SAMPLED"
           ),
         samplestrata = factor(samplestrata, levels = c("5km_5MT", "5km_any",
                                                        "10km_5MT", "10km_any",
                                                        "20km_5MT", "20km_any", 
                                                        "NOT SAMPLED")),
         distancestrata = str_extract(samplestrata, pattern = "[^_]+"),
         distancestrata = factor(distancestrata, levels = c("5km", "10km", "20km")),
         coalprod20k = coalprod20k - coalprod10k,
         coalprod10k = coalprod10k - coalprod5k
  ) %>% 
  filter(!is.na(pop_rural_11) & pop_rural_11 >= 100)

# Plot sample frame villages relative to all by district
sampleframe %>% 
  filter(!is.na(district)) %>%
  group_by(district) %>% 
  mutate(district_samplepop = sum(pop_11 * as.numeric(samplestrata != "NOT SAMPLED"))) %>% 
  ggplot(aes(forcats::fct_rev(forcats::fct_reorder(district, district_samplepop, .desc = T)), 
             pop_11, fill = forcats::fct_rev(samplestrata))) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(values = c('#737373','#c6dbef','#9ecae1','#6baed6','#4292c6','#2171b5','#084594')) +
  labs(fill = "Sample Strata",
       x = NULL)

sampleframe <- sampleframe %>% 
  filter(!is.na(shrid), samplestrata != "NOT SAMPLED") %>% 
  select(shrid, CENSUS2011, state, district, subdistrict,
         distancestrata, samplestrata, 
         coalprod5k,coalprod10k, coalprod20k,
         hhs_11:emp_serv_13, geometry) %>% 
  arrange(samplestrata) %>% 
  st_as_sf() %>% 
  st_transform(crs = "EPSG:4326")

# Plot distribution of revenue village populations
sampleframe %>% 
  ggplot(aes(pop_11)) +
  geom_histogram(bins = 100) +
  scale_x_log10(breaks = c(300, 1000, 3000, 10000))

# Calculate proportion of households in each distance strata that live in 
# revenue villages with either >= | < 5MT coal production in 2019/20
# This is multiplied by desired sample per distance strata (67) to define total
# sample per sub-strata (>= | < 5MT coal production).
samplesizes <- sampleframe %>% 
  tibble() %>% 
  group_by(distancestrata, samplestrata) %>% 
  summarise(villages = n(),
            hhs_11 = sum(hhs_11)) %>% 
  mutate(sharevill = round(villages / sum(villages),2),
         sharehhs = round(hhs_11 / (sum(hhs_11)),2),
         samplebyhhshare = round(67 * sharehhs)) 

write.table(samplesizes, "clipboard", sep="\t", row.names=FALSE, col.names=TRUE)

# Proportional PPS sampling by population --------------------------------------

set.seed(20210719) # Set seed by date of first random draw

sampled_villages <- sampleframe %>% 
  tibble() %>% 
  head(0) %>% 
  mutate(pi = NULL)

for (i in samplesizes$samplestrata) {
  
  samplesize <- samplesizes %>% 
    filter(samplestrata %in% i) %>% 
    pull(samplebyhhshare)
  
  sample <- sampleframe %>% 
    tibble() %>% 
    filter(samplestrata == i) %>% 
    slice(sample(1:n())) %>% 
    ungroup()
  
  # Selecting an extra 20 per strata in case of issues
  sample_indices <- samplingbook::pps.sampling(z =sample$pop_11, n = samplesize + 20, 
                                               method = "midzuno")
  
  sample <- sample[sample_indices$sample,]
  sample$pi <- sample_indices$pik[sample_indices$sample] # probability of inclusion
  
  sampled_villages <- bind_rows(sampled_villages, sample)
}

# Select buffer villages in case of sampling issues and remove from sample -----
sampled_villages_buffer <- sampled_villages %>% 
  group_by(samplestrata) %>% 
  slice(tail(row_number(),20))

sampled_villages <- sampled_villages %>% 
  filter(!shrid %in% sampled_villages_buffer$shrid)

sampled_villages_tbl <- sampled_villages %>% 
  tibble() %>% 
  rowwise() %>% 
  mutate(longitude = st_coordinates(geometry)[1],
         latitude = st_coordinates(geometry)[2]
  ) %>% 
  select(shrid, CENSUS2011, state, district, subdistrict,
         distancestrata, samplestrata, matches("coalprod"),
         pop_11, pi, longitude, latitude)

readr::write_csv(sampled_villages_tbl, here("Data", "sample.csv"))

sampled_villages_buffer_tbl <- sampled_villages_buffer %>% 
  tibble() %>% 
  rowwise() %>% 
  mutate(longitude = st_coordinates(geometry)[1],
         latitude = st_coordinates(geometry)[2]
  ) %>% 
  select(shrid, CENSUS2011, state, district, subdistrict,
         distancestrata, samplestrata, matches("coalprod"),
         pop_11, pi, longitude, latitude)

readr::write_csv(sampled_villages_buffer_tbl, here("Data", "sample_buffer.csv"))
